knitr document van Steensel lab
86 TFs were chosen to desing TF reporters for. In this analysis, I will analyse the motifs of the chosen TFs. The goal is to find out how unique the motifs are and how well they represent the motif landscape.
seq_logos <- list()
for(i in 1:nrow(all_motifs)) {
x <- all_motifs$motif_id[i]
file_x <- paste("/DATA/usr/m.trauernicht/projects/SuRE-TF/data/library_design/lambert_2018/PWMs/", x, sep = "")
if (file.exists(file_x) == T) {
if (str_contains(x, "txt") == TRUE) {
pwm <- read.table(file_x, header = T) %>% dplyr::select(-Pos) %>% t()
seq_logos[i] <- list(pwm)
}
if (str_contains(x, "pcm") == TRUE) {
pwm <- read.table(file_x, header = F) %>% column_to_rownames("V1")
pwm <- as.matrix(pwm)
seq_logos[i] <- list(pwm)
}
if (str_contains(x, "jaspar") == TRUE) {
pwm <- read_jaspar(file_x)
pwm <- pwm@motif
# Transform to relative values
pwm <- colPercents(pwm)/100
pwm <- pwm[1:4,]
seq_logos[i] <- list(pwm)
}
}
}
# Rename pwms
names(seq_logos) <- all_motifs$motif_id
# Remove motifs for which pwms were not available
seq_logos <- seq_logos[!sapply(seq_logos,is.null)]
# Convert to universalmotif format
seq_logos_motif <- list()
for (i in names(seq_logos)) {
seq_logos_motif[[i]] <- universalmotif::convert_motifs(seq_logos[[i]])
seq_logos_motif[[i]]@name <- i
}
saveRDS(seq_logos, "/DATA/usr/m.trauernicht/projects/SuRE-TF/data/library_design/lambert_2018/seq_logos.rds")
# ## Alternative motif collection
# motifs <- convert_motifs(MotifDb)
# motifs <- filter_motifs(motifs,organism="Hsapiens")Aim: Show diversity of chosen motifs.
### UMAP analysis
set.seed(35425)
motif_cor <- motif_sim %>%
replace(is.na(.), 0)
motifs_selected <- rbind(gen1_motifs, gen2_motifs) %>%
mutate(motif_id = ifelse(tf == "SOX9", "M06107_1.94d.txt", motif_id)) %>%
mutate(motif_id = ifelse(tf == "SOX2", "M06121_1.94d.txt", motif_id))
saveRDS(motifs_selected, "/DATA/usr/m.trauernicht/projects/SuRE-TF/data/library_design/lambert_2018/motifs_selected.rds")
tf_umap <- umap(motif_cor) ## as(<dgTMatrix>, "dgCMatrix") is deprecated since Matrix 1.5-0; do as(., "CsparseMatrix") instead
colnames(tf_umap$layout) <- c("A","B")
tf_umap_plot <- tf_umap$layout %>%
as.data.frame() %>%
rownames_to_column(var = "motif_id") %>%
left_join(all_motifs) %>%
left_join(tf_clusters) %>%
#left_join(lambert_clusters) %>%
mutate(selected = ifelse(tf %in% motifs_selected$tf, "Yes", "No")) %>%
mutate(cluster2 = ifelse(cluster <= 10, cluster, ">10")) %>%
mutate(family2 = ifelse(family %in% c("bHLH", "bZIP", "Ets", "Forkhead", "GATA", "Homeodomain", "Nuclear receptor",
"T-box", "C2H2 ZF"), family, "other"))## Joining, by = "motif_id"
## Joining, by = "motif_id"
#Plot UMAP
ggplot(,aes(x = A, y = B)) +
stat_density2d(data = tf_umap_plot, geom = "polygon", aes(alpha = ..level..), bins = 5) +
geom_point(data = tf_umap_plot %>% filter(selected == "No"), size = 1 , color = "grey") +
geom_point(data = tf_umap_plot %>% filter(selected == "Yes"), size = 4 , color = "#E07858") +
theme_pubr() +
scale_alpha(range = c(0.05,0.15)) +
geom_text_repel(data = tf_umap_plot %>% filter(selected == "Yes"), aes(label = tf), force = 86) +
ylab("UMAP2") +
xlab("UMAP1")ggplot(,aes(x = A, y = B)) +
geom_point(data = tf_umap_plot %>% filter(selected == "No"), size = .4 , color = "grey") +
geom_point(data = tf_umap_plot %>% filter(selected == "Yes"), size = 1, color = "#E07858") +
theme_pubr() +
ylab("UMAP2") +
xlab("UMAP1")x <- ggplot(,aes(x = A, y = B, label = tf)) +
geom_point(data = tf_umap_plot %>% filter(selected == "No"), size = .4 , color = "grey") +
geom_point(data = tf_umap_plot %>% filter(selected == "Yes"), size = 1, color = "#E07858") +
theme_pubr() +
ylab("UMAP2") +
xlab("UMAP1")
ggplotly(x)ggplot(,aes(x = A, y = B)) +
stat_density2d(data = tf_umap_plot, geom = "polygon", aes(alpha = ..level..), bins = 5) +
geom_point(data = tf_umap_plot %>% filter(selected == "No"), size = 1 , aes(color = family2)) +
geom_point(data = tf_umap_plot %>% filter(selected == "Yes"), size = 4 , color = "#E07858") +
theme_pubr() +
scale_alpha(range = c(0.05,0.15)) +
geom_text_repel(data = tf_umap_plot %>% filter(selected == "Yes"), aes(label = tf), force = 86) +
ylab("UMAP2") +
xlab("UMAP1")Conclusion: Motifs represent large diversity within the complete motif collection.
motif_similarity_df <- data.frame(motif_sim) %>%
rownames_to_column("motif1") %>%
pivot_longer(cols = -motif1, names_to = "motif2", values_to = "similarity") %>%
mutate(similarity = as.numeric(similarity))
motif_similarity_df_selected <- motif_similarity_df %>%
filter(motif1 %in% motifs_selected$motif_id & motif2 %in% motifs_selected$motif_id) %>%
replace(is.na(.), 0)
motif_similarity_df_selected <- motif_similarity_df_selected %>%
left_join(motifs_selected %>% dplyr::select("motif1" = motif_id, "tf1" = tf)) %>%
left_join(motifs_selected %>% dplyr::select("motif2" = motif_id, "tf2" = tf)) %>%
distinct(tf1, tf2, similarity)## Joining, by = "motif1"
## Joining, by = "motif2"
motif_similarity_mat <- motif_similarity_df_selected %>%
spread(key = "tf2", value = "similarity") %>%
column_to_rownames("tf1")
ord <- hclust( dist(motif_similarity_mat, method = "euclidean"), method = "ward.D2" )$order
motif_similarity_df_selected <- motif_similarity_df_selected %>%
mutate(tf1 = factor(tf1, levels = rownames(motif_similarity_mat)[ord]),
tf2 = factor(tf2, levels = rownames(motif_similarity_mat)[ord]))
ggplot(motif_similarity_df_selected,
aes(x = tf1, y = tf2, fill = similarity)) +
geom_tile(size = .5) +
coord_fixed() +
theme_pubr(x.text.angle = 90, border = T) +
scale_fill_gradient2(low = "#B7B7A4", mid = "white", high = "#E17B5C", midpoint = 0.5)Conclusion: We have some TFs with overlapping motifs. However, those are mostly nuclear receptors that have specific stimulations.
## [1] "Run time: 1.057671 mins"
## [1] "/DATA/usr/m.trauernicht/projects/SuRE-TF/library_design"
## [1] "Wed May 15 16:27:27 2024"
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel grid stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] umap_0.2.8.0 MotifDb_1.32.0
## [3] ggpubr_0.4.0 biomaRt_2.46.3
## [5] ENCODExplorer_2.16.0 LncFinder_1.1.5
## [7] RcmdrMisc_2.7-2 sandwich_3.0-1
## [9] car_3.0-12 carData_3.0-5
## [11] sjmisc_2.8.9 ggrepel_0.9.1
## [13] ggbeeswarm_0.6.0 vwr_0.3.0
## [15] latticeExtra_0.6-29 lattice_0.20-41
## [17] stringdist_0.9.8 data.table_1.14.2
## [19] RColorBrewer_1.1-3 ggseqlogo_0.1
## [21] tibble_3.1.6 pheatmap_1.0.12
## [23] heatmaply_1.3.0 viridis_0.6.2
## [25] viridisLite_0.4.0 plotly_4.10.0
## [27] tidyr_1.2.0 stringr_1.4.0
## [29] readr_2.1.2 dplyr_1.0.8
## [31] magrittr_2.0.3 phylotools_0.2.2
## [33] ape_5.6-2 DNABarcodes_1.20.0
## [35] Matrix_1.5-1 gtools_3.9.2
## [37] SimRAD_0.96 zlibbioc_1.36.0
## [39] ShortRead_1.48.0 GenomicAlignments_1.26.0
## [41] SummarizedExperiment_1.20.0 Biobase_2.50.0
## [43] MatrixGenerics_1.2.1 matrixStats_0.62.0
## [45] Rsamtools_2.6.0 GenomicRanges_1.42.0
## [47] GenomeInfoDb_1.26.7 BiocParallel_1.24.1
## [49] Biostrings_2.58.0 XVector_0.30.0
## [51] IRanges_2.24.1 S4Vectors_0.28.1
## [53] BiocGenerics_0.36.1 universalmotif_1.8.5
## [55] seqLogo_1.56.0 seqinr_4.2-8
## [57] ggplot2_3.4.0
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.3 rtracklayer_1.50.0
## [3] ModelMetrics_1.2.2.2 bit64_4.0.5
## [5] knitr_1.38 DelayedArray_0.16.3
## [7] rpart_4.1-15 hwriter_1.3.2.1
## [9] hardhat_0.2.0 RCurl_1.98-1.6
## [11] generics_0.1.2 RSQLite_2.2.12
## [13] proxy_0.4-26 future_1.25.0
## [15] bit_4.0.4 tzdb_0.3.0
## [17] webshot_0.5.3 xml2_1.3.3
## [19] lubridate_1.8.0 httpuv_1.6.5
## [21] isoband_0.2.5 assertthat_0.2.1
## [23] gower_1.0.0 xfun_0.30
## [25] hms_1.1.1 jquerylib_0.1.4
## [27] evaluate_0.15 promises_1.2.0.1
## [29] TSP_1.2-0 fansi_1.0.3
## [31] progress_1.2.2 dendextend_1.15.2
## [33] dbplyr_2.1.1 readxl_1.4.0
## [35] DBI_1.1.2 htmlwidgets_1.5.4
## [37] purrr_0.3.4 ellipsis_0.3.2
## [39] crosstalk_1.2.0 RSpectra_0.16-1
## [41] backports_1.4.1 insight_0.18.2
## [43] vctrs_0.5.1 sjlabelled_1.2.0
## [45] abind_1.4-5 caret_6.0-92
## [47] cachem_1.0.6 withr_2.5.0
## [49] checkmate_2.1.0 prettyunits_1.1.1
## [51] cluster_2.1.1 lazyeval_0.2.2
## [53] crayon_1.5.1 labeling_0.4.2
## [55] recipes_0.2.0 pkgconfig_2.0.3
## [57] nlme_3.1-152 vipor_0.4.5
## [59] seriation_1.3.5 nnet_7.3-17
## [61] rlang_1.0.6 globals_0.14.0
## [63] lifecycle_1.0.3 registry_0.5-1
## [65] BiocFileCache_1.14.0 AnnotationHub_2.22.1
## [67] cellranger_1.1.0 zoo_1.8-10
## [69] base64enc_0.1-3 beeswarm_0.4.0
## [71] png_0.1-7 bitops_1.0-7
## [73] splitstackshape_1.4.8 pROC_1.18.0
## [75] blob_1.2.3 parallelly_1.31.1
## [77] jpeg_0.1-9 rstatix_0.7.0
## [79] ggsignif_0.6.3 scales_1.2.0
## [81] memoise_2.0.1 plyr_1.8.7
## [83] compiler_4.0.5 cli_3.4.1
## [85] ade4_1.7-19 listenv_0.8.0
## [87] htmlTable_2.4.0 Formula_1.2-4
## [89] MASS_7.3-53.1 tidyselect_1.1.2
## [91] stringi_1.7.6 forcats_0.5.1
## [93] highr_0.9 yaml_2.3.5
## [95] askpass_1.1 sass_0.4.1
## [97] tools_4.0.5 future.apply_1.8.1
## [99] rstudioapi_0.13 BH_1.78.0-0
## [101] foreach_1.5.2 foreign_0.8-81
## [103] gridExtra_2.3 prodlim_2019.11.13
## [105] farver_2.1.0 digest_0.6.29
## [107] BiocManager_1.30.19 shiny_1.7.1
## [109] lava_1.6.10 nortest_1.0-4
## [111] Rcpp_1.0.8.3 broom_0.8.0
## [113] BiocVersion_3.12.0 later_1.3.0
## [115] httr_1.4.2 AnnotationDbi_1.52.0
## [117] Rdpack_2.3 colorspace_2.0-3
## [119] reticulate_1.24 XML_3.99-0.9
## [121] splines_4.0.5 xtable_1.8-4
## [123] jsonlite_1.8.0 timeDate_3043.102
## [125] ipred_0.9-12 R6_2.5.1
## [127] Hmisc_4.7-0 pillar_1.7.0
## [129] htmltools_0.5.2 mime_0.12
## [131] glue_1.6.2 fastmap_1.1.0
## [133] class_7.3-18 interactiveDisplayBase_1.28.0
## [135] codetools_0.2-18 utf8_1.2.2
## [137] bslib_0.3.1 curl_4.3.2
## [139] openssl_2.0.0 survival_3.2-10
## [141] rmarkdown_2.13 munsell_0.5.0
## [143] e1071_1.7-9 GenomeInfoDbData_1.2.4
## [145] iterators_1.0.14 haven_2.5.0
## [147] reshape2_1.4.4 gtable_0.3.0
## [149] rbibutils_2.2.8